Load all the packages we need
library(readxl)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.2.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(RColorBrewer)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(choroplethr)
## Loading required package: acs
## Loading required package: XML
##
## Attaching package: 'acs'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:base':
##
## apply
library(choroplethrMaps)
library(ggbeeswarm)
library(tidyverse)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggbeeswarm)
load all the data we are going to use here and do summary of each dataset.
Summary statistics such as mean,median,mode and quartiles for each variable.
COR_Adult_CCcontact <- read_excel("COR_Adult_CCcontact.xlsx")
summary(COR_Adult_CCcontact)
## Caseload ActualValue
## Min. :2011-10-11 00:00:00 Min. :15255
## 1st Qu.:2013-08-13 00:00:00 1st Qu.:15614
## Median :2015-02-15 00:00:00 Median :16727
## Mean :2015-01-26 15:03:55 Mean :16683
## 3rd Qu.:2016-07-16 00:00:00 3rd Qu.:17531
## Max. :2017-12-17 00:00:00 Max. :18518
## Forecast ForecastValue Variance
## Min. :2011-11-11 00:00:00 Min. :15056 Min. :-1276.0
## 1st Qu.:2013-03-13 00:00:00 1st Qu.:15433 1st Qu.: 0.0
## Median :2014-02-14 00:00:00 Median :15812 Median : 64.0
## Mean :2014-03-05 08:11:42 Mean :16251 Mean : 432.3
## 3rd Qu.:2015-02-15 00:00:00 3rd Qu.:17066 3rd Qu.: 1064.0
## Max. :2017-11-17 00:00:00 Max. :18608 Max. : 2506.0
## VariancePerc
## Min. :-0.077188
## 1st Qu.: 0.000000
## Median : 0.003643
## Mean : 0.027801
## 3rd Qu.: 0.066631
## Max. : 0.160744
MS_ADO_AEM <- read_excel("MS_ADO_AEM.xlsx")
summary(MS_ADO_AEM)
## Caseload ActualValue
## Min. :2015-10-01 00:00:00 Min. : 892.0
## 1st Qu.:2016-12-01 00:00:00 1st Qu.: 963.8
## Median :2017-07-01 00:00:00 Median :1019.5
## Mean :2017-05-28 20:00:00 Mean :1006.3
## 3rd Qu.:2018-01-01 00:00:00 3rd Qu.:1046.0
## Max. :2018-05-01 00:00:00 Max. :1089.0
## Forecast ForecastValue Variance
## Min. :2016-02-01 00:00:00 Min. : 902.4 Min. :-92.000
## 1st Qu.:2016-06-01 00:00:00 1st Qu.: 945.1 1st Qu.: -2.814
## Median :2016-11-01 00:00:00 Median : 994.5 Median : 26.503
## Mean :2016-12-02 15:37:08 Mean : 982.4 Mean : 23.844
## 3rd Qu.:2017-06-01 00:00:00 3rd Qu.:1014.8 3rd Qu.: 53.417
## Max. :2018-06-01 00:00:00 Max. :1057.0 Max. :124.980
## VariancePerc
## Min. :-0.092555
## 1st Qu.:-0.002876
## Median : 0.026227
## Mean : 0.024724
## 3rd Qu.: 0.054438
## Max. : 0.129644
MS_ADO_MCS_Alien <- read_excel("MS_ADO_MCS_Alien.xlsx")
summary(MS_ADO_MCS_Alien)
## Caseload ActualValue
## Min. :2015-10-01 00:00:00 Min. :1856
## 1st Qu.:2016-12-01 00:00:00 1st Qu.:2023
## Median :2017-07-01 00:00:00 Median :2103
## Mean :2017-05-28 20:00:00 Mean :2084
## 3rd Qu.:2018-01-01 00:00:00 3rd Qu.:2174
## Max. :2018-05-01 00:00:00 Max. :2183
## Forecast ForecastValue Variance
## Min. :2016-02-01 00:00:00 Min. :1857 Min. :-51.000
## 1st Qu.:2016-06-01 00:00:00 1st Qu.:1935 1st Qu.: 4.948
## Median :2016-11-01 00:00:00 Median :2003 Median : 53.095
## Mean :2016-12-02 15:37:08 Mean :2021 Mean : 63.305
## 3rd Qu.:2017-06-01 00:00:00 3rd Qu.:2080 3rd Qu.:102.163
## Max. :2018-06-01 00:00:00 Max. :2216 Max. :250.875
## VariancePerc
## Min. :-0.026087
## 1st Qu.: 0.002354
## Median : 0.025596
## Mean : 0.031847
## 3rd Qu.: 0.050414
## Max. : 0.129844
MS_ADO_Total <- read_excel("MS_ADO_Total.xlsx")
summary(MS_ADO_Total)
## Caseload ActualValue
## Min. :2007-08-01 00:00:00 Min. :221062
## 1st Qu.:2011-04-01 00:00:00 1st Qu.:253642
## Median :2013-08-01 00:00:00 Median :256924
## Mean :2013-07-12 10:41:34 Mean :257778
## 3rd Qu.:2015-10-01 00:00:00 3rd Qu.:265125
## Max. :2018-05-01 00:00:00 Max. :280098
## Forecast ForecastValue Variance
## Min. :2007-11-01 00:00:00 Min. :221093 Min. :-65521.7
## 1st Qu.:2010-06-01 00:00:00 1st Qu.:253559 1st Qu.:-12177.7
## Median :2012-11-01 00:00:00 Median :263891 Median : -1514.9
## Mean :2012-09-20 04:34:30 Mean :264817 Mean : -7038.9
## 3rd Qu.:2015-02-01 00:00:00 3rd Qu.:273677 3rd Qu.: 648.8
## Max. :2018-06-01 00:00:00 Max. :320117 Max. : 11822.0
## VariancePerc
## Min. :-0.204681
## 1st Qu.:-0.043609
## Median :-0.005863
## Mean :-0.023138
## 3rd Qu.: 0.002553
## Max. : 0.048505
MS_ADO_QMB <- read_excel("MS_ADO_QMB.xlsx")
summary(MS_ADO_QMB)
## Caseload ActualValue
## Min. :2007-08-01 00:00:00 Min. :14443
## 1st Qu.:2007-12-01 00:00:00 1st Qu.:15106
## Median :2008-02-15 12:00:00 Median :15600
## Mean :2008-02-04 15:36:00 Mean :15402
## 3rd Qu.:2008-05-01 00:00:00 3rd Qu.:15670
## Max. :2008-06-01 00:00:00 Max. :15713
## Forecast ForecastValue Variance
## Min. :2007-11-01 00:00:00 Min. :14200 Min. : 53.0
## 1st Qu.:2007-11-01 00:00:00 1st Qu.:14404 1st Qu.: 399.8
## Median :2007-11-01 00:00:00 Median :14656 Median : 764.0
## Mean :2007-12-24 12:00:00 Mean :14709 Mean : 693.7
## 3rd Qu.:2008-02-01 00:00:00 3rd Qu.:14827 3rd Qu.: 972.6
## Max. :2008-06-01 00:00:00 Max. :15546 Max. :1178.1
## VariancePerc
## Min. :0.003683
## 1st Qu.:0.026488
## Median :0.052760
## Mean :0.047481
## 3rd Qu.:0.066396
## Max. :0.081054
ESA_TANF_Child_Only <- read_excel("ESA_TANF_Child_Only.xlsx")
summary(ESA_TANF_Child_Only)
## Caseload ActualValue
## Min. :2010-07-01 00:00:00 Min. :11847
## 1st Qu.:2013-01-01 00:00:00 1st Qu.:13865
## Median :2014-07-01 00:00:00 Median :15024
## Mean :2014-08-04 14:56:53 Mean :15976
## 3rd Qu.:2016-05-01 00:00:00 3rd Qu.:17043
## Max. :2018-07-01 00:00:00 Max. :25621
## Forecast ForecastValue Variance
## Min. :2011-06-01 00:00:00 Min. :11671 Min. :-4882.0
## 1st Qu.:2012-11-01 00:00:00 1st Qu.:13862 1st Qu.: -717.0
## Median :2014-02-01 00:00:00 Median :15801 Median : -70.0
## Mean :2014-03-02 17:03:06 Mean :16477 Mean : -501.1
## 3rd Qu.:2015-06-01 00:00:00 3rd Qu.:17649 3rd Qu.: 7.0
## Max. :2018-06-01 00:00:00 Max. :25620 Max. : 1173.0
## VariancePerc
## Min. :-0.2299684
## 1st Qu.:-0.0453534
## Median :-0.0050769
## Mean :-0.0283730
## 3rd Qu.: 0.0004377
## Max. : 0.0735534
#To not see any number like "1e+05" we will penalize the scientific notation version by 999
options(scipen=999)
let’s have a first look at the data using ggplot2.
We have used Caseload as a constant element in all graphics for the x axis and we have used ActualValue, ForecastValue, and Variance for the y axis for each dataset we did load here
ESA_TANF_Child_Only %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
Use span to control the “wiggliness” of the default loess smoother. The span is the fraction of points used to fit each local regression. small numbers make a wigglier curve, larger numbers make a smoother curve.
We added a loess smoothing curve. The span of .05 is the result of trial and error to give us the degree of smoothing we wanted.
ESA_TANF_Child_Only %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.05)
ESA_TANF_Child_Only %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.05)
We did use different dataset here. We made the graph interactive using plotly.
spa1 =COR_Adult_CCcontact %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spa1)
spa2 = COR_Adult_CCcontact %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.07)
ggplotly(spa2)
spa3 = COR_Adult_CCcontact %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.07)
ggplotly(spa3)
spb1 =MS_ADO_AEM %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spb1)
spb2 =MS_ADO_AEM %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.2)
ggplotly(spb2)
spb3 =MS_ADO_AEM %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.2)
ggplotly(spb3)
spc1=MS_ADO_MCS_Alien %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spc1)
spc2=MS_ADO_MCS_Alien %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.2)
ggplotly(spc2)
spc3=MS_ADO_MCS_Alien %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.2)
ggplotly(spc3)
spd1=MS_ADO_Total %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spd1)
spd2=MS_ADO_Total %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.05)
ggplotly(spd2)
spd3=MS_ADO_Total %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.05)
ggplotly(spd3)
spe1=MS_ADO_QMB %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spe1)
spe2=MS_ADO_QMB %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.2)
ggplotly(spe2)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1.2124e+09
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.8102e+06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 6.278e+12
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 1.2124e+09
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.8102e+06
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 6.278e+12
spe3=MS_ADO_QMB %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
geom_smooth(method = "loess",span=.2)
ggplotly(spe3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1.2124e+09
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.8102e+06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 6.278e+12
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 1.2124e+09
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.8102e+06
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 6.278e+12